{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "598b6054-6147-44b1-a3f2-35ae1f7828ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import pandas as pd\n",
    "from scipy.stats import mannwhitneyu\n",
    "from statsmodels.sandbox.stats.multicomp import multipletests\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from skbio.stats.composition import clr\n",
    "from matplotlib_venn import venn3\n",
    "\n",
    "sns.set_context('talk')\n",
    "sns.set(rc={\"figure.dpi\":150, 'savefig.dpi':300})\n",
    "sns.set_style('whitegrid')\n",
    "\n",
    "def p_adjust(pvalues, method='fdr_bh'):\n",
    "    res = multipletests(pvalues, method=method)\n",
    "    return np.array(res[1], dtype=float)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b4d9bb0-fa06-4898-aca1-d72c51431e2c",
   "metadata": {},
   "source": [
    "# Metabolomics associations\n",
    "##### 7/18/22\n",
    "##### Michael Shaffer\n",
    "##### Merck ESC, Sys bio group\n",
    "\n",
    "After some success with correlating KOs with continuous titer we decided to use the same approach on the metabolomics data. This data table came from Tom directly."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "396ba13c-ce03-4289-8e8a-5a1ebb4e0f3e",
   "metadata": {},
   "source": [
    "## Read in data\n",
    "\n",
    "Read in the excel table from Tom and fill out empty cells with zeros."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ae77ebe4-ab21-4f4b-b6f7-7023b8acf0a9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GCDCA</th>\n",
       "      <th>GDCA</th>\n",
       "      <th>GHDCA or GUDCA</th>\n",
       "      <th>CA</th>\n",
       "      <th>TCDCA</th>\n",
       "      <th>TCA</th>\n",
       "      <th>CDCA</th>\n",
       "      <th>7oxoCA</th>\n",
       "      <th>TUDCA</th>\n",
       "      <th>UDCA or HDCA</th>\n",
       "      <th>...</th>\n",
       "      <th>Uracil</th>\n",
       "      <th>Adenosine</th>\n",
       "      <th>Phenaceturic acid</th>\n",
       "      <th>N-Acetyl D-galactosamine</th>\n",
       "      <th>Pyridoxal hydrochloride</th>\n",
       "      <th>2-Deoxyuridine</th>\n",
       "      <th>Vanillic acid</th>\n",
       "      <th>Thymine</th>\n",
       "      <th>Nicotinic acid</th>\n",
       "      <th>Homocitrate</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Compound name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>P103_V12_05052020</th>\n",
       "      <td>13165.576990</td>\n",
       "      <td>6486.205847</td>\n",
       "      <td>1037.427698</td>\n",
       "      <td>877.477368</td>\n",
       "      <td>842.441112</td>\n",
       "      <td>428.399288</td>\n",
       "      <td>412.663454</td>\n",
       "      <td>6.309238</td>\n",
       "      <td>26.103778</td>\n",
       "      <td>37.617317</td>\n",
       "      <td>...</td>\n",
       "      <td>571</td>\n",
       "      <td>112</td>\n",
       "      <td>45</td>\n",
       "      <td>90</td>\n",
       "      <td>426</td>\n",
       "      <td>1223</td>\n",
       "      <td>6865</td>\n",
       "      <td>549</td>\n",
       "      <td>217</td>\n",
       "      <td>3044</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P106_V9_04022019</th>\n",
       "      <td>5324.081005</td>\n",
       "      <td>615.267690</td>\n",
       "      <td>1431.812977</td>\n",
       "      <td>536.910350</td>\n",
       "      <td>477.807838</td>\n",
       "      <td>761.254736</td>\n",
       "      <td>55.773899</td>\n",
       "      <td>4.142229</td>\n",
       "      <td>91.815489</td>\n",
       "      <td>19.861706</td>\n",
       "      <td>...</td>\n",
       "      <td>400</td>\n",
       "      <td>368</td>\n",
       "      <td>206</td>\n",
       "      <td>217</td>\n",
       "      <td>201</td>\n",
       "      <td>372</td>\n",
       "      <td>53</td>\n",
       "      <td>230</td>\n",
       "      <td>216</td>\n",
       "      <td>109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P107_A1_04152019</th>\n",
       "      <td>11967.195030</td>\n",
       "      <td>3.048386</td>\n",
       "      <td>8876.579286</td>\n",
       "      <td>884.309058</td>\n",
       "      <td>2720.613568</td>\n",
       "      <td>3492.197205</td>\n",
       "      <td>106.185518</td>\n",
       "      <td>7.807234</td>\n",
       "      <td>664.956760</td>\n",
       "      <td>113.460800</td>\n",
       "      <td>...</td>\n",
       "      <td>594</td>\n",
       "      <td>112</td>\n",
       "      <td>8</td>\n",
       "      <td>330</td>\n",
       "      <td>102</td>\n",
       "      <td>9</td>\n",
       "      <td>87</td>\n",
       "      <td>55</td>\n",
       "      <td>177</td>\n",
       "      <td>6952</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P108_V9_04022019</th>\n",
       "      <td>117.128627</td>\n",
       "      <td>-0.240781</td>\n",
       "      <td>116.125243</td>\n",
       "      <td>29.790592</td>\n",
       "      <td>6.916959</td>\n",
       "      <td>5.775257</td>\n",
       "      <td>1.574994</td>\n",
       "      <td>0.418815</td>\n",
       "      <td>1.671431</td>\n",
       "      <td>4.996911</td>\n",
       "      <td>...</td>\n",
       "      <td>60</td>\n",
       "      <td>226</td>\n",
       "      <td>11</td>\n",
       "      <td>16</td>\n",
       "      <td>249</td>\n",
       "      <td>104</td>\n",
       "      <td>64</td>\n",
       "      <td>18</td>\n",
       "      <td>24</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P108_V12_05212020</th>\n",
       "      <td>16651.224940</td>\n",
       "      <td>1707.969274</td>\n",
       "      <td>3575.590198</td>\n",
       "      <td>1277.281488</td>\n",
       "      <td>2356.663104</td>\n",
       "      <td>1109.351448</td>\n",
       "      <td>204.781749</td>\n",
       "      <td>41.696046</td>\n",
       "      <td>216.707453</td>\n",
       "      <td>107.961759</td>\n",
       "      <td>...</td>\n",
       "      <td>375</td>\n",
       "      <td>184</td>\n",
       "      <td>205</td>\n",
       "      <td>141</td>\n",
       "      <td>253</td>\n",
       "      <td>702</td>\n",
       "      <td>298</td>\n",
       "      <td>196</td>\n",
       "      <td>196</td>\n",
       "      <td>3468</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 111 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                          GCDCA         GDCA  GHDCA or GUDCA           CA  \\\n",
       "Compound name                                                               \n",
       "P103_V12_05052020  13165.576990  6486.205847     1037.427698   877.477368   \n",
       "P106_V9_04022019    5324.081005   615.267690     1431.812977   536.910350   \n",
       "P107_A1_04152019   11967.195030     3.048386     8876.579286   884.309058   \n",
       "P108_V9_04022019     117.128627    -0.240781      116.125243    29.790592   \n",
       "P108_V12_05212020  16651.224940  1707.969274     3575.590198  1277.281488   \n",
       "\n",
       "                         TCDCA          TCA        CDCA     7oxoCA  \\\n",
       "Compound name                                                        \n",
       "P103_V12_05052020   842.441112   428.399288  412.663454   6.309238   \n",
       "P106_V9_04022019    477.807838   761.254736   55.773899   4.142229   \n",
       "P107_A1_04152019   2720.613568  3492.197205  106.185518   7.807234   \n",
       "P108_V9_04022019      6.916959     5.775257    1.574994   0.418815   \n",
       "P108_V12_05212020  2356.663104  1109.351448  204.781749  41.696046   \n",
       "\n",
       "                        TUDCA  UDCA or HDCA  ...  Uracil  Adenosine  \\\n",
       "Compound name                                ...                      \n",
       "P103_V12_05052020   26.103778     37.617317  ...     571        112   \n",
       "P106_V9_04022019    91.815489     19.861706  ...     400        368   \n",
       "P107_A1_04152019   664.956760    113.460800  ...     594        112   \n",
       "P108_V9_04022019     1.671431      4.996911  ...      60        226   \n",
       "P108_V12_05212020  216.707453    107.961759  ...     375        184   \n",
       "\n",
       "                   Phenaceturic acid  N-Acetyl D-galactosamine  \\\n",
       "Compound name                                                    \n",
       "P103_V12_05052020                 45                        90   \n",
       "P106_V9_04022019                 206                       217   \n",
       "P107_A1_04152019                   8                       330   \n",
       "P108_V9_04022019                  11                        16   \n",
       "P108_V12_05212020                205                       141   \n",
       "\n",
       "                   Pyridoxal hydrochloride  2-Deoxyuridine  Vanillic acid  \\\n",
       "Compound name                                                               \n",
       "P103_V12_05052020                      426            1223           6865   \n",
       "P106_V9_04022019                       201             372             53   \n",
       "P107_A1_04152019                       102               9             87   \n",
       "P108_V9_04022019                       249             104             64   \n",
       "P108_V12_05212020                      253             702            298   \n",
       "\n",
       "                   Thymine  Nicotinic acid  Homocitrate  \n",
       "Compound name                                            \n",
       "P103_V12_05052020      549             217         3044  \n",
       "P106_V9_04022019       230             216          109  \n",
       "P107_A1_04152019        55             177         6952  \n",
       "P108_V9_04022019        18              24           36  \n",
       "P108_V12_05212020      196             196         3468  \n",
       "\n",
       "[5 rows x 111 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metab_data = pd.read_excel('../../data/metabolomics_abunds.xlsx', index_col=0).fillna(0)\n",
    "metab_data = metab_data.drop('2-Methyl-1-butanol', axis=1) # drop 2-methyl-1-butanol, bad annotation\n",
    "metab_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6fcdc66f-562b-44ef-aa90-68f9a2a511f2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "111"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(metab_data.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1beeb59b-e8eb-4320-8ead-baff4dc6d0de",
   "metadata": {},
   "source": [
    "111 total compounds."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e25e7c30-c055-461b-94fc-cf83819b00cb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "34"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(metab_data < 0).sum().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "96bf9ccc-21da-4366-9d65-c42295ef8d3d",
   "metadata": {},
   "source": [
    "There are only 34 negative values in the data set so not much to worry about."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "afa2e02b-9851-41bb-be46-7da0a98b70f2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "        Compound name    \n",
       "GDCA    P108_V9_04022019    -0.240781\n",
       "        P134_V9_12062019    -3.462650\n",
       "        P136_V9_01092020    -3.398780\n",
       "        P136_V12_01042021   -0.059861\n",
       "        P202_V12_02252020   -1.708202\n",
       "        P212_V5_05232018    -2.539283\n",
       "        P215_V9_03292019    -4.984036\n",
       "        P215_V12_03302020   -2.424750\n",
       "        P217_V5_06122018    -1.613075\n",
       "        P217_V9_04122019    -0.517431\n",
       "        P218_V5_06062018    -0.427326\n",
       "        P220_V5_06132018    -0.218075\n",
       "        P224_V9_05292019    -0.396865\n",
       "        P229_V5_07022018    -0.664653\n",
       "        P235_V5_08152018    -0.203266\n",
       "        P237_V9_06242019    -0.407863\n",
       "        P239_V5_08292018    -0.942639\n",
       "        P246_V10_10032019   -1.541833\n",
       "        P250_V5_11062018    -1.424605\n",
       "        P250_V9_09092019    -2.380330\n",
       "        P252_V9_09102019    -1.033160\n",
       "        P256_V9_10082018    -0.539685\n",
       "        P258_V5_01032019    -1.193836\n",
       "        P260_V9_11072019    -7.410659\n",
       "        P261_V5_01032019    -2.489533\n",
       "        P263_V9_11262019    -2.335442\n",
       "7oxoCA  P253_V12_09242020   -2.383277\n",
       "DCA     P115_V12_05282020   -0.036132\n",
       "        P136_V9_01092020    -0.927228\n",
       "        P233_V5_07182018    -0.085570\n",
       "        P234_V5_08072018    -1.335333\n",
       "        P236_V5_08142018    -2.929766\n",
       "        P238_V9_06242019    -0.044959\n",
       "        P252_V9_09102019    -1.587543\n",
       "dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metab_data.unstack()[metab_data.unstack() < 0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38c1a43f-663f-4866-b3ef-cd701aa1ed47",
   "metadata": {},
   "source": [
    "There are negative numbers in the data set. Checked with Tom and he said that these can be treated as zeros. So do that."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a69bd109-0485-4dd4-9b7b-b39fab00e911",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "38"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Replace negative numbers with 0 as Tom said in an email\n",
    "metab_data[metab_data < 0] = 0\n",
    "(metab_data == 0).sum().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6925d1d5-9370-4855-af1b-94b3a63700e7",
   "metadata": {},
   "source": [
    "Relative abundance normalized version of table is calculated by summing rows and dividing column values by totals. Makes it so that values are percent abundances of sample."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "758d39b1-abdb-407b-b02c-c0792f1d0d7d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GCDCA</th>\n",
       "      <th>GDCA</th>\n",
       "      <th>GHDCA or GUDCA</th>\n",
       "      <th>CA</th>\n",
       "      <th>TCDCA</th>\n",
       "      <th>TCA</th>\n",
       "      <th>CDCA</th>\n",
       "      <th>7oxoCA</th>\n",
       "      <th>TUDCA</th>\n",
       "      <th>UDCA or HDCA</th>\n",
       "      <th>...</th>\n",
       "      <th>Uracil</th>\n",
       "      <th>Adenosine</th>\n",
       "      <th>Phenaceturic acid</th>\n",
       "      <th>N-Acetyl D-galactosamine</th>\n",
       "      <th>Pyridoxal hydrochloride</th>\n",
       "      <th>2-Deoxyuridine</th>\n",
       "      <th>Vanillic acid</th>\n",
       "      <th>Thymine</th>\n",
       "      <th>Nicotinic acid</th>\n",
       "      <th>Homocitrate</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Compound name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>P103_V12_05052020</th>\n",
       "      <td>0.001680</td>\n",
       "      <td>8.275956e-04</td>\n",
       "      <td>0.000132</td>\n",
       "      <td>0.000112</td>\n",
       "      <td>0.000107</td>\n",
       "      <td>0.000055</td>\n",
       "      <td>5.265304e-05</td>\n",
       "      <td>8.050157e-07</td>\n",
       "      <td>3.330664e-06</td>\n",
       "      <td>0.000005</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000073</td>\n",
       "      <td>0.000014</td>\n",
       "      <td>0.000006</td>\n",
       "      <td>0.000011</td>\n",
       "      <td>0.000054</td>\n",
       "      <td>0.000156</td>\n",
       "      <td>0.000876</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000028</td>\n",
       "      <td>0.000388</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P106_V9_04022019</th>\n",
       "      <td>0.000847</td>\n",
       "      <td>9.783810e-05</td>\n",
       "      <td>0.000228</td>\n",
       "      <td>0.000085</td>\n",
       "      <td>0.000076</td>\n",
       "      <td>0.000121</td>\n",
       "      <td>8.869005e-06</td>\n",
       "      <td>6.586853e-07</td>\n",
       "      <td>1.460024e-05</td>\n",
       "      <td>0.000003</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000064</td>\n",
       "      <td>0.000059</td>\n",
       "      <td>0.000033</td>\n",
       "      <td>0.000035</td>\n",
       "      <td>0.000032</td>\n",
       "      <td>0.000059</td>\n",
       "      <td>0.000008</td>\n",
       "      <td>0.000037</td>\n",
       "      <td>0.000034</td>\n",
       "      <td>0.000017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P107_A1_04152019</th>\n",
       "      <td>0.002120</td>\n",
       "      <td>5.401497e-07</td>\n",
       "      <td>0.001573</td>\n",
       "      <td>0.000157</td>\n",
       "      <td>0.000482</td>\n",
       "      <td>0.000619</td>\n",
       "      <td>1.881523e-05</td>\n",
       "      <td>1.383380e-06</td>\n",
       "      <td>1.178250e-04</td>\n",
       "      <td>0.000020</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000105</td>\n",
       "      <td>0.000020</td>\n",
       "      <td>0.000001</td>\n",
       "      <td>0.000058</td>\n",
       "      <td>0.000018</td>\n",
       "      <td>0.000002</td>\n",
       "      <td>0.000015</td>\n",
       "      <td>0.000010</td>\n",
       "      <td>0.000031</td>\n",
       "      <td>0.001232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P108_V9_04022019</th>\n",
       "      <td>0.000065</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000065</td>\n",
       "      <td>0.000017</td>\n",
       "      <td>0.000004</td>\n",
       "      <td>0.000003</td>\n",
       "      <td>8.751132e-07</td>\n",
       "      <td>2.327061e-07</td>\n",
       "      <td>9.286966e-07</td>\n",
       "      <td>0.000003</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000033</td>\n",
       "      <td>0.000126</td>\n",
       "      <td>0.000006</td>\n",
       "      <td>0.000009</td>\n",
       "      <td>0.000138</td>\n",
       "      <td>0.000058</td>\n",
       "      <td>0.000036</td>\n",
       "      <td>0.000010</td>\n",
       "      <td>0.000013</td>\n",
       "      <td>0.000020</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P108_V12_05212020</th>\n",
       "      <td>0.002614</td>\n",
       "      <td>2.681638e-04</td>\n",
       "      <td>0.000561</td>\n",
       "      <td>0.000201</td>\n",
       "      <td>0.000370</td>\n",
       "      <td>0.000174</td>\n",
       "      <td>3.215225e-05</td>\n",
       "      <td>6.546587e-06</td>\n",
       "      <td>3.402467e-05</td>\n",
       "      <td>0.000017</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000059</td>\n",
       "      <td>0.000029</td>\n",
       "      <td>0.000032</td>\n",
       "      <td>0.000022</td>\n",
       "      <td>0.000040</td>\n",
       "      <td>0.000110</td>\n",
       "      <td>0.000047</td>\n",
       "      <td>0.000031</td>\n",
       "      <td>0.000031</td>\n",
       "      <td>0.000545</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 111 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                      GCDCA          GDCA  GHDCA or GUDCA        CA     TCDCA  \\\n",
       "Compound name                                                                   \n",
       "P103_V12_05052020  0.001680  8.275956e-04        0.000132  0.000112  0.000107   \n",
       "P106_V9_04022019   0.000847  9.783810e-05        0.000228  0.000085  0.000076   \n",
       "P107_A1_04152019   0.002120  5.401497e-07        0.001573  0.000157  0.000482   \n",
       "P108_V9_04022019   0.000065  0.000000e+00        0.000065  0.000017  0.000004   \n",
       "P108_V12_05212020  0.002614  2.681638e-04        0.000561  0.000201  0.000370   \n",
       "\n",
       "                        TCA          CDCA        7oxoCA         TUDCA  \\\n",
       "Compound name                                                           \n",
       "P103_V12_05052020  0.000055  5.265304e-05  8.050157e-07  3.330664e-06   \n",
       "P106_V9_04022019   0.000121  8.869005e-06  6.586853e-07  1.460024e-05   \n",
       "P107_A1_04152019   0.000619  1.881523e-05  1.383380e-06  1.178250e-04   \n",
       "P108_V9_04022019   0.000003  8.751132e-07  2.327061e-07  9.286966e-07   \n",
       "P108_V12_05212020  0.000174  3.215225e-05  6.546587e-06  3.402467e-05   \n",
       "\n",
       "                   UDCA or HDCA  ...    Uracil  Adenosine  Phenaceturic acid  \\\n",
       "Compound name                    ...                                           \n",
       "P103_V12_05052020      0.000005  ...  0.000073   0.000014           0.000006   \n",
       "P106_V9_04022019       0.000003  ...  0.000064   0.000059           0.000033   \n",
       "P107_A1_04152019       0.000020  ...  0.000105   0.000020           0.000001   \n",
       "P108_V9_04022019       0.000003  ...  0.000033   0.000126           0.000006   \n",
       "P108_V12_05212020      0.000017  ...  0.000059   0.000029           0.000032   \n",
       "\n",
       "                   N-Acetyl D-galactosamine  Pyridoxal hydrochloride  \\\n",
       "Compound name                                                          \n",
       "P103_V12_05052020                  0.000011                 0.000054   \n",
       "P106_V9_04022019                   0.000035                 0.000032   \n",
       "P107_A1_04152019                   0.000058                 0.000018   \n",
       "P108_V9_04022019                   0.000009                 0.000138   \n",
       "P108_V12_05212020                  0.000022                 0.000040   \n",
       "\n",
       "                   2-Deoxyuridine  Vanillic acid   Thymine  Nicotinic acid  \\\n",
       "Compound name                                                                \n",
       "P103_V12_05052020        0.000156       0.000876  0.000070        0.000028   \n",
       "P106_V9_04022019         0.000059       0.000008  0.000037        0.000034   \n",
       "P107_A1_04152019         0.000002       0.000015  0.000010        0.000031   \n",
       "P108_V9_04022019         0.000058       0.000036  0.000010        0.000013   \n",
       "P108_V12_05212020        0.000110       0.000047  0.000031        0.000031   \n",
       "\n",
       "                   Homocitrate  \n",
       "Compound name                   \n",
       "P103_V12_05052020     0.000388  \n",
       "P106_V9_04022019      0.000017  \n",
       "P107_A1_04152019      0.001232  \n",
       "P108_V9_04022019      0.000020  \n",
       "P108_V12_05212020     0.000545  \n",
       "\n",
       "[5 rows x 111 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metab_data_rel = metab_data.div(metab_data.sum(axis=1), axis=0)\n",
    "metab_data_rel.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9bcaeb76-e9a5-4c12-93b9-3d8c26568a4c",
   "metadata": {},
   "source": [
    "CLR normalized version of table is calculated using CLR (centered log ratio transformation) which controls for compositionality. This is done using the scikit-bio package."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0464f150-61f1-49ff-b44b-2f7e29552f7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "metab_data_clr = pd.DataFrame(clr(metab_data + .001), index=metab_data.index, columns=metab_data.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d27a752-cf90-4bdf-b80d-81f5e723401e",
   "metadata": {},
   "source": [
    "Build simple metadata table based on the sample names from the metabolomics table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "63e7c570-f966-4e62-a8ec-6638352c0044",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BabyN</th>\n",
       "      <th>VisitCode</th>\n",
       "      <th>VisitDate</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SampleID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>P103_V12_05052020</th>\n",
       "      <td>103</td>\n",
       "      <td>V12</td>\n",
       "      <td>05052020</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P106_V9_04022019</th>\n",
       "      <td>106</td>\n",
       "      <td>V9</td>\n",
       "      <td>04022019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P107_A1_04152019</th>\n",
       "      <td>107</td>\n",
       "      <td>A1</td>\n",
       "      <td>04152019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P108_V9_04022019</th>\n",
       "      <td>108</td>\n",
       "      <td>V9</td>\n",
       "      <td>04022019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P108_V12_05212020</th>\n",
       "      <td>108</td>\n",
       "      <td>V12</td>\n",
       "      <td>05212020</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   BabyN VisitCode VisitDate\n",
       "SampleID                                    \n",
       "P103_V12_05052020    103       V12  05052020\n",
       "P106_V9_04022019     106        V9  04022019\n",
       "P107_A1_04152019     107        A1  04152019\n",
       "P108_V9_04022019     108        V9  04022019\n",
       "P108_V12_05212020    108       V12  05212020"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta_rows = [[i, int(i.split('_')[0].strip('P')), i.split('_')[1], i.split('_')[-1]] for i in metab_data.index]\n",
    "meta_base = pd.DataFrame(meta_rows, columns=['SampleID', 'BabyN', 'VisitCode', 'VisitDate']).set_index('SampleID')\n",
    "meta_base.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "949d7209-89e7-4db9-afc6-1ccc4c674bf4",
   "metadata": {},
   "source": [
    "Connect sample data to 1 yr titer data via the baby numbers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e5b53321-c5d2-4289-9b6c-3e9a6a4f7fe6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PT</th>\n",
       "      <th>Dip</th>\n",
       "      <th>FHA</th>\n",
       "      <th>PRN</th>\n",
       "      <th>TET</th>\n",
       "      <th>PRP (Hib)</th>\n",
       "      <th>PCV ST1</th>\n",
       "      <th>PCV ST3</th>\n",
       "      <th>PCV ST4</th>\n",
       "      <th>PCV ST5</th>\n",
       "      <th>...</th>\n",
       "      <th>median_mmNorm</th>\n",
       "      <th>median_mmNorm_DTAPHib</th>\n",
       "      <th>median_mmNorm_PCV</th>\n",
       "      <th>PT_protected</th>\n",
       "      <th>Dip_protected</th>\n",
       "      <th>FHA_protected</th>\n",
       "      <th>PRN_protected</th>\n",
       "      <th>TET_protected</th>\n",
       "      <th>PRP (Hib)_protected</th>\n",
       "      <th>VR_group</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>2.5</td>\n",
       "      <td>0.21</td>\n",
       "      <td>11.0</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.30</td>\n",
       "      <td>0.39</td>\n",
       "      <td>141.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>56.0</td>\n",
       "      <td>139.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>0.052874</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>2.5</td>\n",
       "      <td>0.44</td>\n",
       "      <td>3.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>0.52</td>\n",
       "      <td>1.60</td>\n",
       "      <td>2430.0</td>\n",
       "      <td>415.0</td>\n",
       "      <td>194.0</td>\n",
       "      <td>332.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.449483</td>\n",
       "      <td>0.114018</td>\n",
       "      <td>0.958142</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>2.5</td>\n",
       "      <td>0.05</td>\n",
       "      <td>1.5</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.27</td>\n",
       "      <td>21.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>41.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.003102</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>LVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>27.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>63.0</td>\n",
       "      <td>1.35</td>\n",
       "      <td>7.02</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.700925</td>\n",
       "      <td>0.763049</td>\n",
       "      <td>0.486810</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>14.0</td>\n",
       "      <td>0.24</td>\n",
       "      <td>15.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>2.45</td>\n",
       "      <td>NaN</td>\n",
       "      <td>301.0</td>\n",
       "      <td>63.0</td>\n",
       "      <td>400.0</td>\n",
       "      <td>289.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.266219</td>\n",
       "      <td>0.284211</td>\n",
       "      <td>0.245121</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 48 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       PT   Dip   FHA   PRN   TET  PRP (Hib)  PCV ST1  PCV ST3  PCV ST4  \\\n",
       "106   2.5  0.21  11.0   2.5  0.30       0.39    141.0     35.0     56.0   \n",
       "107   2.5  0.44   3.0   9.0  0.52       1.60   2430.0    415.0    194.0   \n",
       "108   2.5  0.05   1.5   2.5  0.05       0.27     21.0      3.0     24.0   \n",
       "109  27.0   NaN   NaN  63.0  1.35       7.02      NaN      NaN      NaN   \n",
       "110  14.0  0.24  15.0  20.0  2.45        NaN    301.0     63.0    400.0   \n",
       "\n",
       "     PCV ST5  ...  median_mmNorm  median_mmNorm_DTAPHib  median_mmNorm_PCV  \\\n",
       "106    139.0  ...       0.061955               0.052874           0.061955   \n",
       "107    332.0  ...       0.449483               0.114018           0.958142   \n",
       "108     41.0  ...       0.000000               0.000000           0.003102   \n",
       "109      NaN  ...       0.700925               0.763049           0.486810   \n",
       "110    289.0  ...       0.266219               0.284211           0.245121   \n",
       "\n",
       "     PT_protected  Dip_protected  FHA_protected  PRN_protected  TET_protected  \\\n",
       "106         False           True           True          False           True   \n",
       "107         False           True          False           True           True   \n",
       "108         False          False          False          False          False   \n",
       "109          True          False          False           True           True   \n",
       "110          True           True           True           True           True   \n",
       "\n",
       "     PRP (Hib)_protected  VR_group  \n",
       "106                 True       NVR  \n",
       "107                 True       NVR  \n",
       "108                 True       LVR  \n",
       "109                 True       NVR  \n",
       "110                False       NVR  \n",
       "\n",
       "[5 rows x 48 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titer_data = pd.read_csv('../../data/vaccine_response/vaccine_response_y1.tsv', sep='\\t', index_col=0)\n",
    "titer_data.index = [int(i.split('Baby')[-1]) for i in titer_data.index]\n",
    "titer_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c54eec5d-efc0-401d-bc27-efc2ce77a949",
   "metadata": {},
   "source": [
    "Use BabyN to connect compound abundance to titer values since there is only one 2 month sample and one set of 1 year titer values per baby."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4a7b806f-edf7-481a-8870-2e5ccd59a03c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PT</th>\n",
       "      <th>Dip</th>\n",
       "      <th>FHA</th>\n",
       "      <th>PRN</th>\n",
       "      <th>TET</th>\n",
       "      <th>PRP (Hib)</th>\n",
       "      <th>PCV ST1</th>\n",
       "      <th>PCV ST3</th>\n",
       "      <th>PCV ST4</th>\n",
       "      <th>PCV ST5</th>\n",
       "      <th>...</th>\n",
       "      <th>median_mmNorm</th>\n",
       "      <th>median_mmNorm_DTAPHib</th>\n",
       "      <th>median_mmNorm_PCV</th>\n",
       "      <th>PT_protected</th>\n",
       "      <th>Dip_protected</th>\n",
       "      <th>FHA_protected</th>\n",
       "      <th>PRN_protected</th>\n",
       "      <th>TET_protected</th>\n",
       "      <th>PRP (Hib)_protected</th>\n",
       "      <th>VR_group</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>P106_V9_04022019</th>\n",
       "      <td>2.5</td>\n",
       "      <td>0.21</td>\n",
       "      <td>11.0</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.39</td>\n",
       "      <td>141.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>56.0</td>\n",
       "      <td>139.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>0.052874</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P107_A1_04152019</th>\n",
       "      <td>2.5</td>\n",
       "      <td>0.44</td>\n",
       "      <td>3.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>0.52</td>\n",
       "      <td>1.6</td>\n",
       "      <td>2430.0</td>\n",
       "      <td>415.0</td>\n",
       "      <td>194.0</td>\n",
       "      <td>332.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.449483</td>\n",
       "      <td>0.114018</td>\n",
       "      <td>0.958142</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P108_V9_04022019</th>\n",
       "      <td>2.5</td>\n",
       "      <td>0.05</td>\n",
       "      <td>1.5</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.27</td>\n",
       "      <td>21.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>41.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.003102</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>LVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P108_V12_05212020</th>\n",
       "      <td>2.5</td>\n",
       "      <td>0.05</td>\n",
       "      <td>1.5</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.27</td>\n",
       "      <td>21.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>41.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.003102</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>LVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P110_V9_05172019</th>\n",
       "      <td>14.0</td>\n",
       "      <td>0.24</td>\n",
       "      <td>15.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>2.45</td>\n",
       "      <td>NaN</td>\n",
       "      <td>301.0</td>\n",
       "      <td>63.0</td>\n",
       "      <td>400.0</td>\n",
       "      <td>289.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.266219</td>\n",
       "      <td>0.284211</td>\n",
       "      <td>0.245121</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 48 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     PT   Dip   FHA   PRN   TET PRP (Hib) PCV ST1 PCV ST3  \\\n",
       "P106_V9_04022019    2.5  0.21  11.0   2.5   0.3      0.39   141.0    35.0   \n",
       "P107_A1_04152019    2.5  0.44   3.0   9.0  0.52       1.6  2430.0   415.0   \n",
       "P108_V9_04022019    2.5  0.05   1.5   2.5  0.05      0.27    21.0     3.0   \n",
       "P108_V12_05212020   2.5  0.05   1.5   2.5  0.05      0.27    21.0     3.0   \n",
       "P110_V9_05172019   14.0  0.24  15.0  20.0  2.45       NaN   301.0    63.0   \n",
       "\n",
       "                  PCV ST4 PCV ST5  ... median_mmNorm median_mmNorm_DTAPHib  \\\n",
       "P106_V9_04022019     56.0   139.0  ...      0.061955              0.052874   \n",
       "P107_A1_04152019    194.0   332.0  ...      0.449483              0.114018   \n",
       "P108_V9_04022019     24.0    41.0  ...           0.0                   0.0   \n",
       "P108_V12_05212020    24.0    41.0  ...           0.0                   0.0   \n",
       "P110_V9_05172019    400.0   289.0  ...      0.266219              0.284211   \n",
       "\n",
       "                  median_mmNorm_PCV PT_protected Dip_protected FHA_protected  \\\n",
       "P106_V9_04022019           0.061955        False          True          True   \n",
       "P107_A1_04152019           0.958142        False          True         False   \n",
       "P108_V9_04022019           0.003102        False         False         False   \n",
       "P108_V12_05212020          0.003102        False         False         False   \n",
       "P110_V9_05172019           0.245121         True          True          True   \n",
       "\n",
       "                  PRN_protected TET_protected PRP (Hib)_protected VR_group  \n",
       "P106_V9_04022019          False          True                True      NVR  \n",
       "P107_A1_04152019           True          True                True      NVR  \n",
       "P108_V9_04022019          False         False                True      LVR  \n",
       "P108_V12_05212020         False         False                True      LVR  \n",
       "P110_V9_05172019           True          True               False      NVR  \n",
       "\n",
       "[5 rows x 48 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "per_sample_titer_data = pd.DataFrame({sample: titer_data.loc[i] for sample, i in meta_base['BabyN'].iteritems() if i in titer_data.index}).transpose()\n",
    "per_sample_titer_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b4cf895-d716-479f-94bd-66fb124d9230",
   "metadata": {},
   "source": [
    "Merge sample metadata and titer data, filter out samples without one year titers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "20f8a3a2-43fd-4ba8-9b87-9f5da5d6dd72",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BabyN</th>\n",
       "      <th>VisitCode</th>\n",
       "      <th>VisitDate</th>\n",
       "      <th>PT</th>\n",
       "      <th>Dip</th>\n",
       "      <th>FHA</th>\n",
       "      <th>PRN</th>\n",
       "      <th>TET</th>\n",
       "      <th>PRP (Hib)</th>\n",
       "      <th>PCV ST1</th>\n",
       "      <th>...</th>\n",
       "      <th>median_mmNorm</th>\n",
       "      <th>median_mmNorm_DTAPHib</th>\n",
       "      <th>median_mmNorm_PCV</th>\n",
       "      <th>PT_protected</th>\n",
       "      <th>Dip_protected</th>\n",
       "      <th>FHA_protected</th>\n",
       "      <th>PRN_protected</th>\n",
       "      <th>TET_protected</th>\n",
       "      <th>PRP (Hib)_protected</th>\n",
       "      <th>VR_group</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>P106_V9_04022019</th>\n",
       "      <td>106</td>\n",
       "      <td>V9</td>\n",
       "      <td>04022019</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.21</td>\n",
       "      <td>11.0</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.39</td>\n",
       "      <td>141.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>0.052874</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P107_A1_04152019</th>\n",
       "      <td>107</td>\n",
       "      <td>A1</td>\n",
       "      <td>04152019</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.44</td>\n",
       "      <td>3.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>0.52</td>\n",
       "      <td>1.6</td>\n",
       "      <td>2430.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.449483</td>\n",
       "      <td>0.114018</td>\n",
       "      <td>0.958142</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P108_V9_04022019</th>\n",
       "      <td>108</td>\n",
       "      <td>V9</td>\n",
       "      <td>04022019</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.05</td>\n",
       "      <td>1.5</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.27</td>\n",
       "      <td>21.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.003102</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>LVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P108_V12_05212020</th>\n",
       "      <td>108</td>\n",
       "      <td>V12</td>\n",
       "      <td>05212020</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.05</td>\n",
       "      <td>1.5</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.27</td>\n",
       "      <td>21.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.003102</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>LVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P110_V9_05172019</th>\n",
       "      <td>110</td>\n",
       "      <td>V9</td>\n",
       "      <td>05172019</td>\n",
       "      <td>14.0</td>\n",
       "      <td>0.24</td>\n",
       "      <td>15.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>2.45</td>\n",
       "      <td>NaN</td>\n",
       "      <td>301.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.266219</td>\n",
       "      <td>0.284211</td>\n",
       "      <td>0.245121</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 51 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   BabyN VisitCode VisitDate    PT   Dip   FHA   PRN   TET  \\\n",
       "P106_V9_04022019     106        V9  04022019   2.5  0.21  11.0   2.5   0.3   \n",
       "P107_A1_04152019     107        A1  04152019   2.5  0.44   3.0   9.0  0.52   \n",
       "P108_V9_04022019     108        V9  04022019   2.5  0.05   1.5   2.5  0.05   \n",
       "P108_V12_05212020    108       V12  05212020   2.5  0.05   1.5   2.5  0.05   \n",
       "P110_V9_05172019     110        V9  05172019  14.0  0.24  15.0  20.0  2.45   \n",
       "\n",
       "                  PRP (Hib) PCV ST1  ... median_mmNorm median_mmNorm_DTAPHib  \\\n",
       "P106_V9_04022019       0.39   141.0  ...      0.061955              0.052874   \n",
       "P107_A1_04152019        1.6  2430.0  ...      0.449483              0.114018   \n",
       "P108_V9_04022019       0.27    21.0  ...           0.0                   0.0   \n",
       "P108_V12_05212020      0.27    21.0  ...           0.0                   0.0   \n",
       "P110_V9_05172019        NaN   301.0  ...      0.266219              0.284211   \n",
       "\n",
       "                  median_mmNorm_PCV PT_protected Dip_protected FHA_protected  \\\n",
       "P106_V9_04022019           0.061955        False          True          True   \n",
       "P107_A1_04152019           0.958142        False          True         False   \n",
       "P108_V9_04022019           0.003102        False         False         False   \n",
       "P108_V12_05212020          0.003102        False         False         False   \n",
       "P110_V9_05172019           0.245121         True          True          True   \n",
       "\n",
       "                  PRN_protected TET_protected PRP (Hib)_protected VR_group  \n",
       "P106_V9_04022019          False          True                True      NVR  \n",
       "P107_A1_04152019           True          True                True      NVR  \n",
       "P108_V9_04022019          False         False                True      LVR  \n",
       "P108_V12_05212020         False         False                True      LVR  \n",
       "P110_V9_05172019           True          True               False      NVR  \n",
       "\n",
       "[5 rows x 51 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta = pd.concat([meta_base, per_sample_titer_data], axis=1)\n",
    "meta = meta.loc[~pd.isna(meta['VR_group'])]\n",
    "meta.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf45137b-050f-4ab9-89b1-bbe6296846b9",
   "metadata": {},
   "source": [
    "Find shared samples between the metadata and metabolomics data. Filter metadata to match the metabolomics."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "d22833fb-4d09-4de6-a803-5662cdf76539",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(141, 51)\n"
     ]
    }
   ],
   "source": [
    "in_both = list(set(meta.index) & set(metab_data.index))\n",
    "meta_matched = meta.loc[in_both]\n",
    "print(meta_matched.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "5c6f28aa-5692-4347-a4d5-072a5d3e2785",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "V9     52\n",
       "V12    44\n",
       "V5     36\n",
       "V10     4\n",
       "A1      2\n",
       "S3      1\n",
       "A2      1\n",
       "A3      1\n",
       "Name: VisitCode, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta_matched['VisitCode'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55a439b0-b9da-4b27-afcd-caca8dff4d4e",
   "metadata": {},
   "source": [
    "Plurality of samples from 1 year. Many from 2 months and 2 years as well. We will focus on two months (V5) and 1 year (V9)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc99daf9-3980-4c54-a718-e84635f9fa20",
   "metadata": {},
   "source": [
    "## Do correlation with un-normalized data\n",
    "\n",
    "First take the \"raw\" data (not further normalized from what Tom gave me) and correlate each feature with the mei."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e4751f63-b098-4ef0-8b56-eeb683a12052",
   "metadata": {},
   "source": [
    "### 2 months (V5) correlations\n",
    "\n",
    "First focus on the 2 month time points.\n",
    "\n",
    "We are using spearman for all of these correlations and multiple testing correction with Benjamini-Hochberg."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "aded8164-fcd1-4dab-a585-e275a7525ebb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>metabolite</th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>serotonin</td>\n",
       "      <td>21332.737790</td>\n",
       "      <td>10654.264478</td>\n",
       "      <td>109.0</td>\n",
       "      <td>0.020677</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>5-HIAA</td>\n",
       "      <td>4568.590914</td>\n",
       "      <td>2555.333651</td>\n",
       "      <td>106.0</td>\n",
       "      <td>0.033036</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>81</th>\n",
       "      <td>Phenylpyruvic acid</td>\n",
       "      <td>4399.250000</td>\n",
       "      <td>2038.093750</td>\n",
       "      <td>102.0</td>\n",
       "      <td>0.057143</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>4-hydroxyphenylacetate</td>\n",
       "      <td>6244.250000</td>\n",
       "      <td>2738.843750</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0.073926</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>Maleic acid</td>\n",
       "      <td>13727.750000</td>\n",
       "      <td>9218.843750</td>\n",
       "      <td>98.0</td>\n",
       "      <td>0.092488</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                metabolite      LVR_mean      NVR_mean  statistic   p_value  \\\n",
       "13               serotonin  21332.737790  10654.264478      109.0  0.020677   \n",
       "12                  5-HIAA   4568.590914   2555.333651      106.0  0.033036   \n",
       "81      Phenylpyruvic acid   4399.250000   2038.093750      102.0  0.057143   \n",
       "53  4-hydroxyphenylacetate   6244.250000   2738.843750      100.0  0.073926   \n",
       "40             Maleic acid  13727.750000   9218.843750       98.0  0.092488   \n",
       "\n",
       "    p_adj  \n",
       "13    1.0  \n",
       "12    1.0  \n",
       "81    1.0  \n",
       "53    1.0  \n",
       "40    1.0  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta_v5 = meta_matched.query(\"VisitCode == 'V5'\")\n",
    "metab_data_v5 = metab_data.loc[meta_v5.index]\n",
    "metab_stats_v5_rows = list()\n",
    "for metabolite, row in metab_data_v5.transpose().iterrows():\n",
    "    lvr_abunds = row[meta_v5.query('VR_group == \"LVR\"').index]\n",
    "    nvr_abunds = row[meta_v5.query('VR_group == \"NVR\"').index]\n",
    "    # check for not all zeros\n",
    "    # lvr_gt_20 = (lvr_abunds != 0).sum()/len(lvr_abunds) > .2\n",
    "    # nvr_gt_20 = (nvr_abunds != 0).sum()/len(nvr_abunds) > .2\n",
    "    lvr_gt_20 = lvr_abunds.sum()/len(lvr_abunds) > 10\n",
    "    nvr_gt_20 = nvr_abunds.sum()/len(nvr_abunds) > 10\n",
    "    if lvr_gt_20 or nvr_gt_20:\n",
    "        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds)\n",
    "        metab_stats_v5_rows.append([metabolite, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])\n",
    "metab_stats_v5 = pd.DataFrame(metab_stats_v5_rows, columns=['metabolite', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')\n",
    "metab_stats_v5['p_adj'] = p_adjust(metab_stats_v5['p_value'])\n",
    "metab_stats_v5.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1065224c-79e6-4d85-8d58-952e564594ae",
   "metadata": {},
   "source": [
    "### 1 year (V9) correlations\n",
    "\n",
    "Same correlations but with year 1 un-normalized data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "d45b2557-fc9d-42a9-ac98-babe8ba4d779",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>metabolite</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>L-Citrulline</th>\n",
       "      <td>2723.800000</td>\n",
       "      <td>6113.428571</td>\n",
       "      <td>22.0</td>\n",
       "      <td>0.000013</td>\n",
       "      <td>0.000827</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Serine</th>\n",
       "      <td>4132.800000</td>\n",
       "      <td>11121.142857</td>\n",
       "      <td>23.0</td>\n",
       "      <td>0.000015</td>\n",
       "      <td>0.000827</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Histidine</th>\n",
       "      <td>433.700000</td>\n",
       "      <td>955.476190</td>\n",
       "      <td>40.0</td>\n",
       "      <td>0.000083</td>\n",
       "      <td>0.002456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-asparagine</th>\n",
       "      <td>839.700000</td>\n",
       "      <td>1450.380952</td>\n",
       "      <td>43.5</td>\n",
       "      <td>0.000116</td>\n",
       "      <td>0.002456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Methionine</th>\n",
       "      <td>8814.600000</td>\n",
       "      <td>18713.357143</td>\n",
       "      <td>45.0</td>\n",
       "      <td>0.000134</td>\n",
       "      <td>0.002456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Threonine</th>\n",
       "      <td>18041.100000</td>\n",
       "      <td>40270.000000</td>\n",
       "      <td>46.0</td>\n",
       "      <td>0.000147</td>\n",
       "      <td>0.002456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Proline</th>\n",
       "      <td>3105.500000</td>\n",
       "      <td>7522.000000</td>\n",
       "      <td>48.0</td>\n",
       "      <td>0.000177</td>\n",
       "      <td>0.002456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Tyrosine</th>\n",
       "      <td>21412.700000</td>\n",
       "      <td>41665.071429</td>\n",
       "      <td>48.0</td>\n",
       "      <td>0.000177</td>\n",
       "      <td>0.002456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Glutamine</th>\n",
       "      <td>8473.300000</td>\n",
       "      <td>16422.500000</td>\n",
       "      <td>50.0</td>\n",
       "      <td>0.000213</td>\n",
       "      <td>0.002625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>serotonin</th>\n",
       "      <td>397.631584</td>\n",
       "      <td>3892.661440</td>\n",
       "      <td>52.0</td>\n",
       "      <td>0.000255</td>\n",
       "      <td>0.002820</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hydroxyphenyllactate</th>\n",
       "      <td>5792.300000</td>\n",
       "      <td>9667.690476</td>\n",
       "      <td>53.0</td>\n",
       "      <td>0.000279</td>\n",
       "      <td>0.002820</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Phenylalanine</th>\n",
       "      <td>311875.800000</td>\n",
       "      <td>537173.738095</td>\n",
       "      <td>56.0</td>\n",
       "      <td>0.000365</td>\n",
       "      <td>0.003379</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Isoleucine</th>\n",
       "      <td>2966.300000</td>\n",
       "      <td>5936.833333</td>\n",
       "      <td>57.0</td>\n",
       "      <td>0.000399</td>\n",
       "      <td>0.003406</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Leucine</th>\n",
       "      <td>820.000000</td>\n",
       "      <td>1784.023810</td>\n",
       "      <td>58.0</td>\n",
       "      <td>0.000435</td>\n",
       "      <td>0.003452</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>D-pantothenic acid</th>\n",
       "      <td>4382.900000</td>\n",
       "      <td>8797.238095</td>\n",
       "      <td>60.0</td>\n",
       "      <td>0.000518</td>\n",
       "      <td>0.003835</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>indole lactate</th>\n",
       "      <td>1281.800000</td>\n",
       "      <td>2266.285714</td>\n",
       "      <td>64.0</td>\n",
       "      <td>0.000729</td>\n",
       "      <td>0.005060</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>xanthurenic acid</th>\n",
       "      <td>3253.249323</td>\n",
       "      <td>6438.697378</td>\n",
       "      <td>65.0</td>\n",
       "      <td>0.000794</td>\n",
       "      <td>0.005182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>N-Acetylneuraminic acid</th>\n",
       "      <td>1928.000000</td>\n",
       "      <td>3250.357143</td>\n",
       "      <td>66.0</td>\n",
       "      <td>0.000863</td>\n",
       "      <td>0.005321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CDCA</th>\n",
       "      <td>73.860439</td>\n",
       "      <td>361.874311</td>\n",
       "      <td>68.0</td>\n",
       "      <td>0.001018</td>\n",
       "      <td>0.005950</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Glutamic acid</th>\n",
       "      <td>235614.900000</td>\n",
       "      <td>385352.714286</td>\n",
       "      <td>71.0</td>\n",
       "      <td>0.001301</td>\n",
       "      <td>0.007222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Aspartic Acid</th>\n",
       "      <td>25975.100000</td>\n",
       "      <td>64397.785714</td>\n",
       "      <td>72.0</td>\n",
       "      <td>0.001410</td>\n",
       "      <td>0.007455</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>indole acetic acid</th>\n",
       "      <td>4774.500000</td>\n",
       "      <td>9107.547619</td>\n",
       "      <td>73.0</td>\n",
       "      <td>0.001528</td>\n",
       "      <td>0.007647</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3-Indoleacetic acid</th>\n",
       "      <td>4778.000000</td>\n",
       "      <td>9098.642857</td>\n",
       "      <td>74.0</td>\n",
       "      <td>0.001655</td>\n",
       "      <td>0.007647</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DL-2-Aminoadipic acid</th>\n",
       "      <td>388.100000</td>\n",
       "      <td>883.119048</td>\n",
       "      <td>75.0</td>\n",
       "      <td>0.001791</td>\n",
       "      <td>0.007647</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Kynurenine</th>\n",
       "      <td>9827.900000</td>\n",
       "      <td>17430.357143</td>\n",
       "      <td>75.0</td>\n",
       "      <td>0.001791</td>\n",
       "      <td>0.007647</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2-Deoxycytidine 5-diphosphate</th>\n",
       "      <td>2615.500000</td>\n",
       "      <td>5332.761905</td>\n",
       "      <td>75.0</td>\n",
       "      <td>0.001791</td>\n",
       "      <td>0.007647</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3-phenyllactic acid</th>\n",
       "      <td>4150.900000</td>\n",
       "      <td>7458.809524</td>\n",
       "      <td>78.0</td>\n",
       "      <td>0.002264</td>\n",
       "      <td>0.009308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5-HIAA</th>\n",
       "      <td>1337.464751</td>\n",
       "      <td>3904.702241</td>\n",
       "      <td>79.0</td>\n",
       "      <td>0.002446</td>\n",
       "      <td>0.009696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CA</th>\n",
       "      <td>446.812179</td>\n",
       "      <td>2036.955763</td>\n",
       "      <td>81.0</td>\n",
       "      <td>0.002849</td>\n",
       "      <td>0.010906</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Tryptophan</th>\n",
       "      <td>110050.900000</td>\n",
       "      <td>168568.619048</td>\n",
       "      <td>85.0</td>\n",
       "      <td>0.003844</td>\n",
       "      <td>0.013765</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4-Pyridoxic acid</th>\n",
       "      <td>2240.600000</td>\n",
       "      <td>4977.880952</td>\n",
       "      <td>85.0</td>\n",
       "      <td>0.003844</td>\n",
       "      <td>0.013765</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-tryptophan-D5</th>\n",
       "      <td>26618.000000</td>\n",
       "      <td>18953.452381</td>\n",
       "      <td>332.0</td>\n",
       "      <td>0.004787</td>\n",
       "      <td>0.016606</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Thymine</th>\n",
       "      <td>104.900000</td>\n",
       "      <td>196.333333</td>\n",
       "      <td>89.0</td>\n",
       "      <td>0.005144</td>\n",
       "      <td>0.017304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Nicotinic acid</th>\n",
       "      <td>197.600000</td>\n",
       "      <td>1158.261905</td>\n",
       "      <td>90.0</td>\n",
       "      <td>0.005521</td>\n",
       "      <td>0.018026</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Mevalonic acid</th>\n",
       "      <td>40525.200000</td>\n",
       "      <td>153637.880952</td>\n",
       "      <td>91.0</td>\n",
       "      <td>0.005935</td>\n",
       "      <td>0.018822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Creatinine</th>\n",
       "      <td>2660.200000</td>\n",
       "      <td>3769.761905</td>\n",
       "      <td>94.0</td>\n",
       "      <td>0.007322</td>\n",
       "      <td>0.022577</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>N-Acetyl-alpha-D-glucosamine 1-phosphate</th>\n",
       "      <td>3867.500000</td>\n",
       "      <td>5942.404762</td>\n",
       "      <td>95.0</td>\n",
       "      <td>0.007848</td>\n",
       "      <td>0.023544</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Orotic acid</th>\n",
       "      <td>661.700000</td>\n",
       "      <td>1405.928571</td>\n",
       "      <td>96.5</td>\n",
       "      <td>0.008698</td>\n",
       "      <td>0.025407</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L-Carnitine</th>\n",
       "      <td>1115.800000</td>\n",
       "      <td>1715.333333</td>\n",
       "      <td>98.0</td>\n",
       "      <td>0.009629</td>\n",
       "      <td>0.027405</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Glyceric acid</th>\n",
       "      <td>54652.500000</td>\n",
       "      <td>81292.071429</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0.011010</td>\n",
       "      <td>0.030552</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Guanosine</th>\n",
       "      <td>2878.600000</td>\n",
       "      <td>1626.714286</td>\n",
       "      <td>319.0</td>\n",
       "      <td>0.011763</td>\n",
       "      <td>0.031847</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Xanthine</th>\n",
       "      <td>12744.100000</td>\n",
       "      <td>23927.571429</td>\n",
       "      <td>103.0</td>\n",
       "      <td>0.013408</td>\n",
       "      <td>0.034612</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Phenylpyruvic acid</th>\n",
       "      <td>1038.700000</td>\n",
       "      <td>1764.214286</td>\n",
       "      <td>103.0</td>\n",
       "      <td>0.013408</td>\n",
       "      <td>0.034612</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TCDCA</th>\n",
       "      <td>783.655858</td>\n",
       "      <td>1436.196180</td>\n",
       "      <td>104.0</td>\n",
       "      <td>0.014305</td>\n",
       "      <td>0.035285</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TUDCA</th>\n",
       "      <td>61.526243</td>\n",
       "      <td>188.616756</td>\n",
       "      <td>104.0</td>\n",
       "      <td>0.014305</td>\n",
       "      <td>0.035285</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Riboflavin</th>\n",
       "      <td>200.500000</td>\n",
       "      <td>369.666667</td>\n",
       "      <td>104.5</td>\n",
       "      <td>0.014766</td>\n",
       "      <td>0.035631</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>UDCA or HDCA</th>\n",
       "      <td>29.438454</td>\n",
       "      <td>117.376424</td>\n",
       "      <td>107.0</td>\n",
       "      <td>0.017319</td>\n",
       "      <td>0.040903</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>kynurenic acid</th>\n",
       "      <td>1566.400000</td>\n",
       "      <td>2420.380952</td>\n",
       "      <td>110.0</td>\n",
       "      <td>0.020877</td>\n",
       "      <td>0.048277</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>p-hydroxy-hippuric acid</th>\n",
       "      <td>296.200000</td>\n",
       "      <td>612.619048</td>\n",
       "      <td>110.5</td>\n",
       "      <td>0.021525</td>\n",
       "      <td>0.048761</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               LVR_mean       NVR_mean  \\\n",
       "metabolite                                                               \n",
       "L-Citrulline                                2723.800000    6113.428571   \n",
       "L-Serine                                    4132.800000   11121.142857   \n",
       "L-Histidine                                  433.700000     955.476190   \n",
       "L-asparagine                                 839.700000    1450.380952   \n",
       "L-Methionine                                8814.600000   18713.357143   \n",
       "L-Threonine                                18041.100000   40270.000000   \n",
       "L-Proline                                   3105.500000    7522.000000   \n",
       "L-Tyrosine                                 21412.700000   41665.071429   \n",
       "L-Glutamine                                 8473.300000   16422.500000   \n",
       "serotonin                                    397.631584    3892.661440   \n",
       "hydroxyphenyllactate                        5792.300000    9667.690476   \n",
       "L-Phenylalanine                           311875.800000  537173.738095   \n",
       "L-Isoleucine                                2966.300000    5936.833333   \n",
       "L-Leucine                                    820.000000    1784.023810   \n",
       "D-pantothenic acid                          4382.900000    8797.238095   \n",
       "indole lactate                              1281.800000    2266.285714   \n",
       "xanthurenic acid                            3253.249323    6438.697378   \n",
       "N-Acetylneuraminic acid                     1928.000000    3250.357143   \n",
       "CDCA                                          73.860439     361.874311   \n",
       "L-Glutamic acid                           235614.900000  385352.714286   \n",
       "L-Aspartic Acid                            25975.100000   64397.785714   \n",
       "indole acetic acid                          4774.500000    9107.547619   \n",
       "3-Indoleacetic acid                         4778.000000    9098.642857   \n",
       "DL-2-Aminoadipic acid                        388.100000     883.119048   \n",
       "L-Kynurenine                                9827.900000   17430.357143   \n",
       "2-Deoxycytidine 5-diphosphate               2615.500000    5332.761905   \n",
       "3-phenyllactic acid                         4150.900000    7458.809524   \n",
       "5-HIAA                                      1337.464751    3904.702241   \n",
       "CA                                           446.812179    2036.955763   \n",
       "L-Tryptophan                              110050.900000  168568.619048   \n",
       "4-Pyridoxic acid                            2240.600000    4977.880952   \n",
       "L-tryptophan-D5                            26618.000000   18953.452381   \n",
       "Thymine                                      104.900000     196.333333   \n",
       "Nicotinic acid                               197.600000    1158.261905   \n",
       "Mevalonic acid                             40525.200000  153637.880952   \n",
       "Creatinine                                  2660.200000    3769.761905   \n",
       "N-Acetyl-alpha-D-glucosamine 1-phosphate    3867.500000    5942.404762   \n",
       "Orotic acid                                  661.700000    1405.928571   \n",
       "L-Carnitine                                 1115.800000    1715.333333   \n",
       "Glyceric acid                              54652.500000   81292.071429   \n",
       "Guanosine                                   2878.600000    1626.714286   \n",
       "Xanthine                                   12744.100000   23927.571429   \n",
       "Phenylpyruvic acid                          1038.700000    1764.214286   \n",
       "TCDCA                                        783.655858    1436.196180   \n",
       "TUDCA                                         61.526243     188.616756   \n",
       "Riboflavin                                   200.500000     369.666667   \n",
       "UDCA or HDCA                                  29.438454     117.376424   \n",
       "kynurenic acid                              1566.400000    2420.380952   \n",
       "p-hydroxy-hippuric acid                      296.200000     612.619048   \n",
       "\n",
       "                                          statistic   p_value     p_adj  \n",
       "metabolite                                                               \n",
       "L-Citrulline                                   22.0  0.000013  0.000827  \n",
       "L-Serine                                       23.0  0.000015  0.000827  \n",
       "L-Histidine                                    40.0  0.000083  0.002456  \n",
       "L-asparagine                                   43.5  0.000116  0.002456  \n",
       "L-Methionine                                   45.0  0.000134  0.002456  \n",
       "L-Threonine                                    46.0  0.000147  0.002456  \n",
       "L-Proline                                      48.0  0.000177  0.002456  \n",
       "L-Tyrosine                                     48.0  0.000177  0.002456  \n",
       "L-Glutamine                                    50.0  0.000213  0.002625  \n",
       "serotonin                                      52.0  0.000255  0.002820  \n",
       "hydroxyphenyllactate                           53.0  0.000279  0.002820  \n",
       "L-Phenylalanine                                56.0  0.000365  0.003379  \n",
       "L-Isoleucine                                   57.0  0.000399  0.003406  \n",
       "L-Leucine                                      58.0  0.000435  0.003452  \n",
       "D-pantothenic acid                             60.0  0.000518  0.003835  \n",
       "indole lactate                                 64.0  0.000729  0.005060  \n",
       "xanthurenic acid                               65.0  0.000794  0.005182  \n",
       "N-Acetylneuraminic acid                        66.0  0.000863  0.005321  \n",
       "CDCA                                           68.0  0.001018  0.005950  \n",
       "L-Glutamic acid                                71.0  0.001301  0.007222  \n",
       "L-Aspartic Acid                                72.0  0.001410  0.007455  \n",
       "indole acetic acid                             73.0  0.001528  0.007647  \n",
       "3-Indoleacetic acid                            74.0  0.001655  0.007647  \n",
       "DL-2-Aminoadipic acid                          75.0  0.001791  0.007647  \n",
       "L-Kynurenine                                   75.0  0.001791  0.007647  \n",
       "2-Deoxycytidine 5-diphosphate                  75.0  0.001791  0.007647  \n",
       "3-phenyllactic acid                            78.0  0.002264  0.009308  \n",
       "5-HIAA                                         79.0  0.002446  0.009696  \n",
       "CA                                             81.0  0.002849  0.010906  \n",
       "L-Tryptophan                                   85.0  0.003844  0.013765  \n",
       "4-Pyridoxic acid                               85.0  0.003844  0.013765  \n",
       "L-tryptophan-D5                               332.0  0.004787  0.016606  \n",
       "Thymine                                        89.0  0.005144  0.017304  \n",
       "Nicotinic acid                                 90.0  0.005521  0.018026  \n",
       "Mevalonic acid                                 91.0  0.005935  0.018822  \n",
       "Creatinine                                     94.0  0.007322  0.022577  \n",
       "N-Acetyl-alpha-D-glucosamine 1-phosphate       95.0  0.007848  0.023544  \n",
       "Orotic acid                                    96.5  0.008698  0.025407  \n",
       "L-Carnitine                                    98.0  0.009629  0.027405  \n",
       "Glyceric acid                                 100.0  0.011010  0.030552  \n",
       "Guanosine                                     319.0  0.011763  0.031847  \n",
       "Xanthine                                      103.0  0.013408  0.034612  \n",
       "Phenylpyruvic acid                            103.0  0.013408  0.034612  \n",
       "TCDCA                                         104.0  0.014305  0.035285  \n",
       "TUDCA                                         104.0  0.014305  0.035285  \n",
       "Riboflavin                                    104.5  0.014766  0.035631  \n",
       "UDCA or HDCA                                  107.0  0.017319  0.040903  \n",
       "kynurenic acid                                110.0  0.020877  0.048277  \n",
       "p-hydroxy-hippuric acid                       110.5  0.021525  0.048761  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta_v9 = meta_matched.query(\"VisitCode == 'V9'\")\n",
    "metab_data_v9 = metab_data.loc[meta_v9.index]\n",
    "metab_stats_v9_rows = list()\n",
    "for metabolite, row in metab_data_v9.transpose().iterrows():\n",
    "    lvr_abunds = row[meta_v9.query('VR_group == \"LVR\"').index]\n",
    "    nvr_abunds = row[meta_v9.query('VR_group == \"NVR\"').index]\n",
    "    # check for not all zeros\n",
    "    # lvr_gt_20 = (lvr_abunds != 0).sum()/len(lvr_abunds) > .2\n",
    "    # nvr_gt_20 = (nvr_abunds != 0).sum()/len(nvr_abunds) > .2\n",
    "    lvr_gt_20 = lvr_abunds.sum()/len(lvr_abunds) > 10\n",
    "    nvr_gt_20 = nvr_abunds.sum()/len(nvr_abunds) > 10\n",
    "    if lvr_gt_20 or nvr_gt_20:\n",
    "        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds)\n",
    "        metab_stats_v9_rows.append([metabolite, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])\n",
    "metab_stats_v9 = pd.DataFrame(metab_stats_v9_rows, columns=['metabolite', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')\n",
    "metab_stats_v9['p_adj'] = p_adjust(metab_stats_v9['p_value'])\n",
    "metab_stats_v9 = metab_stats_v9.set_index('metabolite')\n",
    "metab_stats_v9.query('p_adj < .05')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b7e80142-407a-471f-9cdf-747d18bb1d27",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "49"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(metab_stats_v9.query('p_adj < .05'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a03d5953-d901-4d7c-83a8-ce7cb9894b88",
   "metadata": {},
   "source": [
    "49 compounds have significant adjusted p-values. o about half of things are significant. Is this an artifact or does it mean that base metabolism is associated with titer response?\n",
    "\n",
    "Tryptophan D5 is in this list. It is a control compound so we probably shouldn't trust this."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "0348866c-9d90-42d7-a36c-2cff5ee4ee48",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>metabolite</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>L-tryptophan-D5</th>\n",
       "      <td>26618.0</td>\n",
       "      <td>18953.452381</td>\n",
       "      <td>332.0</td>\n",
       "      <td>0.004787</td>\n",
       "      <td>0.016606</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 LVR_mean      NVR_mean  statistic   p_value     p_adj\n",
       "metabolite                                                            \n",
       "L-tryptophan-D5   26618.0  18953.452381      332.0  0.004787  0.016606"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metab_stats_v9.loc[[i for i in metab_stats_v9.index if 'D5' in i]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "360f77fa-bcb3-4639-9ab0-224d474aa94d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
